""" Code to run BR and ECC classifiers on Human rights data """

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import ClassifierChain, BinaryRelevance
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import normalize
import pandas as pd
import sklearn.metrics as metrics
from sklearn.svm import SVC
from scipy.sparse import csr_matrix
from skmultilearn.ensemble import RakelD
import numpy as np
from skmultilearn.problem_transform import LabelPowerset
import sys
import warnings
warnings.filterwarnings('ignore')


def run_BR(X_all, y_all, labels,
           seeds=None,
           function_name='logistic', pct=100):
    """
    Runs Binary Relevance classifier on HR data

    One model is trained and evaluated per entry of `seeds`, each time on a
    fresh 80/20 train/test split drawn with that seed.

    Inputs
        X_all : matrix of terms
        y_all : target labels (train and test); indexed with `labels`
        labels : list of labels' names (columns of y_all to predict)
        seeds : iterable of random states for the train/test split, one run
                per seed; defaults to range(5)
        function_name : type of base classifier: 'SVM' (linear SVC),
                        'RF' (random forest); any other value falls back
                        to logistic regression
        pct : percentage of features used; only recorded in the output,
              the feature selection itself is done by the caller
    ------------
    Returns
        df_final : Dataframe with one row per seed holding subset accuracy,
                   F1-micro, F1-macro, Hamming loss and ranking loss, plus
                   the 'percentage_features' and 'algo' columns

    """
    # A None sentinel avoids sharing one mutable default list across calls.
    seeds = list(range(5)) if seeds is None else list(seeds)
    y_values = y_all[labels].values

    temp = []
    print('Running BR \n')
    for i, seed in enumerate(seeds):

        all_acc = {}
        print(f'Iteration {i+1} of {len(seeds)}')

        if function_name == 'SVM':
            # probability=True is required: predict_proba is called below
            # and SVC does not expose it otherwise.
            model = BinaryRelevance(SVC(kernel='linear', probability=True))
        elif function_name == 'RF':
            model = BinaryRelevance(RandomForestClassifier())
        else:
            model = BinaryRelevance(LogisticRegression(C=1, solver='lbfgs',
                                                       max_iter=1000))
        X_train, X_test, y_train, y_test = train_test_split(
            X_all, y_values, test_size=0.2,
            random_state=seed)

        model.fit(X_train, y_train)
        # Densify once; the metrics below need dense arrays.
        y_pred = model.predict(X_test).toarray()
        pred_probs = model.predict_proba(X_test).toarray()

        all_acc['Subset Accuracy'] = 100*metrics.accuracy_score(
            y_test, y_pred)
        all_acc['Micro-F1'] = 100*metrics.f1_score(
            y_test, y_pred, average='micro')
        all_acc['Macro-F1'] = 100*metrics.f1_score(
            y_test, y_pred, average='macro')
        all_acc['Hamming Loss'] = 100*metrics.hamming_loss(y_test, y_pred)
        all_acc['Ranking Loss'] = metrics.label_ranking_loss(
            y_test, pred_probs)

        all_acc['percentage_features'] = pct
        all_acc['algo'] = 'BR'

        temp.append(all_acc)

    df_final = pd.DataFrame(temp)
    # numeric_only skips the string 'algo' column, which cannot be averaged.
    print(f'Metrics mean : \n{df_final.mean(axis=0, numeric_only=True)}')

    return df_final


def run_ECC(
        X, y_all, label_cols, seeds=None,
        function_name='logistic', pct=100, n_chains=5):
    """
    Runs ECC (ensemble of classifier chains) on HR data

    For every seed, `n_chains` classifier chains are trained, each on a
    random ordering of the labels. Their predicted probabilities are put
    back in the original label order and averaged; labels are assigned
    where the averaged probability exceeds 0.5.

    Inputs
        X : matrix of terms
        y_all : target labels (train and test); indexed with `label_cols`
        label_cols : list of labels' names
        seeds : iterable of integer random states, one run per seed;
                defaults to range(5). Each seed drives both the
                train/test split and the chain orderings of that run
        function_name : type of base classifier: 'SVM' (linear SVC),
                        'RF' (random forest); any other value falls back
                        to logistic regression
        pct : percentage of features used; only recorded in the output,
              the feature selection itself is done by the caller
        n_chains : number of chains in the ensemble (at least 1)
    ------------
    Returns
        df_final : Dataframe with one row per seed holding subset accuracy,
                   F1-micro, F1-macro, Hamming loss and ranking loss, plus
                   the 'percentage_features' and 'algo' columns

    """
    if n_chains < 1:
        raise ValueError('n_chains must be at least 1')
    # A None sentinel avoids sharing one mutable default list across calls.
    seeds = list(range(5)) if seeds is None else list(seeds)

    print('Running ECC \n')
    y_values = y_all[label_cols].values
    n_labels = len(label_cols)
    temp = []
    for i, seed in enumerate(seeds):

        all_acc = {}
        print(f'Iteration {i+1} of {len(seeds)}')

        # The split depends only on the seed, so it is done once per seed
        # and the label columns are permuted afterwards for each chain.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_values, test_size=0.2,
            random_state=seed)

        # Seeded generator so the chain orderings are repeatable per seed.
        rng = np.random.RandomState(seed)
        outputs_list = []
        for _ in range(n_chains):
            permute = rng.permutation(n_labels)
            # argsort of a permutation is its inverse: it maps the chain's
            # outputs back to the original label order.
            reorder = np.argsort(permute)

            if function_name == 'SVM':
                model = ClassifierChain(SVC(kernel='linear', probability=True))
            elif function_name == 'RF':
                model = ClassifierChain(RandomForestClassifier())
            else:
                model = ClassifierChain(LogisticRegression(
                    C=1,
                    solver='lbfgs', max_iter=500))

            model.fit(X_train, y_train[:, permute])
            pred_probs = model.predict_proba(X_test).toarray()

            outputs_list.append(pred_probs[:, reorder])

        ens_out = np.mean(outputs_list, axis=0)
        labels_out = (ens_out > 0.5).astype(int)

        all_acc['Subset Accuracy'] = 100*metrics.accuracy_score(
            y_test, labels_out)
        all_acc['Micro-F1'] = 100*metrics.f1_score(
            y_test, labels_out, average='micro')
        all_acc['Macro-F1'] = 100*metrics.f1_score(
            y_test, labels_out, average='macro')
        all_acc['Hamming Loss'] = 100*metrics.hamming_loss(y_test, labels_out)
        all_acc['Ranking Loss'] = metrics.label_ranking_loss(y_test, ens_out)
        all_acc['percentage_features'] = pct
        all_acc['algo'] = 'ECC'

        temp.append(all_acc)

    df_final = pd.DataFrame(temp)
    # Print before returning; numeric_only skips the string 'algo' column.
    print(f'Metrics mean : \n{df_final.mean(axis=0, numeric_only=True)}')

    return df_final


def main(number_iterations=10, pcts=(1, 0.8, 0.6, 0.4, 0.2)):
    """
    Runs both BR and ECC classifiers on human rights data
    Saves results in the results folder
    -----------
    Inputs
    number_iterations : int, how many times each classifier will run
                        (seeds 0 .. number_iterations-1 are used)
    pcts : sequence of floats, the fractions of features that will be used;
           for each one a random subset of feature columns is drawn
    """

    df = pd.read_csv('../data_clean/human_rights.csv')
    # Label columns span 'disap'..'injud'; term columns start at 'abolish'.
    y_all = df.loc[:, 'disap':'injud']
    X_all = normalize(df.loc[:, 'abolish':].values)
    X_all = csr_matrix(X_all)

    labels = list(y_all)
    n_features = X_all.shape[1]

    result_frames = []
    for pct in pcts:
        print(f'Running models with {pct*100}% of features ')
        # Random column subset, drawn without replacement (unseeded).
        subset_feat = np.random.choice(
            n_features, int(n_features*pct), replace=False)
        X_subset = X_all[:, subset_feat]
        seeds = list(range(number_iterations))

        # NOTE(review): 'SVC' is not one of the names run_BR recognises
        # ('SVM', 'RF'), so BR runs with its logistic-regression fallback.
        # Confirm which base classifier is intended before changing it.
        temp_br = run_BR(
            X_subset, y_all, labels, seeds=seeds,
            function_name='SVC', pct=pct)
        temp_ecc = run_ECC(
            X_subset, y_all, labels, seeds=seeds,
            function_name='RF', pct=pct)
        result_frames.extend([temp_br, temp_ecc])

    # Concatenate once at the end instead of growing a DataFrame per loop.
    if result_frames:
        all_results = pd.concat(result_frames)
    else:
        all_results = pd.DataFrame()

    all_results.to_csv('../results/Human Rights/hr_results.csv')


if __name__ == "__main__":
    # Optional first CLI argument: number of iterations per classifier.
    cli_args = sys.argv[1:]
    if not cli_args:
        main()
    else:
        main(number_iterations=int(cli_args[0]))
